ggplotly() and plot_ly()plot_geo()We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.
library(tidyverse)
library(plotly)
library(widgetframe)
library(tidytext)
library(skimr)
sb_locs <-
read_csv("starbucks-locations.csv", show_col_types = FALSE)
sb_nutr <-
read_csv("starbucks-menu-nutrition.csv", show_col_types = FALSE)
usa_pop <- read_csv("us_state_pop.csv", show_col_types = FALSE)
usa_states <- read_csv("states.csv", show_col_types = FALSE)
skim(sb_locs)
| Name | sb_locs |
| Number of rows | 25600 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Brand | 0 | 1.00 | 7 | 21 | 0 | 4 | 0 |
| Store Number | 0 | 1.00 | 5 | 12 | 0 | 25599 | 0 |
| Store Name | 0 | 1.00 | 2 | 46 | 0 | 25364 | 0 |
| Ownership Type | 0 | 1.00 | 8 | 13 | 0 | 4 | 0 |
| Street Address | 2 | 1.00 | 1 | 165 | 0 | 25353 | 0 |
| City | 15 | 1.00 | 2 | 29 | 0 | 5469 | 0 |
| State/Province | 0 | 1.00 | 1 | 3 | 0 | 338 | 0 |
| Country | 0 | 1.00 | 2 | 2 | 0 | 73 | 0 |
| Postcode | 1522 | 0.94 | 1 | 9 | 0 | 18887 | 0 |
| Phone Number | 6861 | 0.73 | 1 | 15 | 0 | 18559 | 0 |
| Timezone | 0 | 1.00 | 18 | 30 | 0 | 101 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Longitude | 1 | 1 | -27.87 | 96.84 | -159.46 | -104.66 | -79.35 | 100.63 | 176.92 | ▇▇▂▂▅ |
| Latitude | 1 | 1 | 34.79 | 13.34 | -46.41 | 31.24 | 36.75 | 41.57 | 64.85 | ▁▁▁▇▂ |
skim(sb_nutr)
| Name | sb_nutr |
| Number of rows | 205 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Item | 0 | 1 | 10 | 65 | 0 | 187 | 0 |
| Category | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Calories | 0 | 1 | 257.24 | 158.36 | 0 | 130 | 250 | 380 | 650 | ▇▇▇▆▁ |
| Fat (g) | 0 | 1 | 10.06 | 9.66 | 0 | 0 | 7 | 18 | 37 | ▇▂▃▂▁ |
| Carb. (g) | 0 | 1 | 33.97 | 17.61 | 0 | 21 | 35 | 45 | 80 | ▃▆▇▃▁ |
| Fiber (g) | 0 | 1 | 1.77 | 2.63 | 0 | 0 | 1 | 3 | 21 | ▇▁▁▁▁ |
| Protein (g) | 0 | 1 | 8.19 | 8.19 | 0 | 1 | 6 | 13 | 34 | ▇▃▂▂▁ |
skim(usa_pop)
| Name | usa_pop |
| Number of rows | 55 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| state | 0 | 1 | 4 | 24 | 0 | 55 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| population | 0 | 1 | 5677621 | 6714730 | 56882 | 1344331 | 3751351 | 6515716 | 37253956 | ▇▂▁▁▁ |
skim(usa_states)
| Name | usa_states |
| Number of rows | 51 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| State | 0 | 1 | 4 | 20 | 0 | 51 | 0 |
| Abbreviation | 0 | 1 | 2 | 2 | 0 | 51 | 0 |
sb_usa <- sb_locs |> filter(Country == "US")
sb_locs_state <- sb_usa |>
group_by(`State/Province`) |>
rename(state = 'State/Province') |>
summarize(n_stores = n())
usa_pop_abbr <-
full_join(usa_pop, usa_states,
by = join_by(state == State))
sb_locs_state <- full_join(sb_locs_state,
usa_pop_abbr,
by = join_by(state == Abbreviation))
summary(sb_locs_state)
## state n_stores state.y population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDAAnswer the following questions:
p1 <-
ggplot(sb_locs_state, aes(x = population, y = n_stores, color = state)) +
geom_point(alpha = 0.8)
ggplotly(p1)
We can see that the values of population and number of stores are roughly linear in the scatterplot.
p2 <- ggplot() +
geom_histogram(
data = subset(sb_nutr, Category == "Food"),
aes(x = Calories, fill = "Food"),
bins = 25,
color = "black",
alpha = 0.5
) +
geom_histogram(
data = subset(sb_nutr, Category == "Drinks"),
aes(x = Calories, fill = "Drinks"),
bins = 25,
color = "black",
alpha = 0.5
) +
labs(title = "Caloric Distribution of Starbucks Menu Items",
x = "Calories", y = "Frequency") +
scale_fill_manual(
values = c("blue", "red"),
name = "Category",
breaks = c("Food", "Drinks"),
labels = c("Food", "Drinks")
)
ggplotly(p2)
Food tend to have a higher average calories in distribution than drinks.
p3 <- sb_nutr |>
unnest_tokens(word, Item) %>%
count(word, sort = TRUE) %>%
head(20) %>%
ggplot(aes(x = fct_reorder(word, n), y = n)) +
geom_col(fill = "blue") +
coord_flip() +
labs(title = "Top 20 Words in Starbucks Menu Items",
x = "Word",
y = "Frequency")
ggplotly(p3)
plot_ly()plot_ly() representing the
relationship between calories and carbssb_nutr %>%
plot_ly(
x = ~ Calories,
y = ~ `Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~ Category
)
hovermode = "compare"topwords <- sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
count(word, sort = T) %>%
head(10)
sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
filter(word %in% topwords$word) %>%
plot_ly(
x = ~ Calories,
y = ~ `Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~ Category,
text = ~ paste0(
"Item: ",
word,
"<br>Calories: ",
Calories,
"<br>Carb. (g): ",
`Carb. (g)`
),
hoverinfo = 'text'
) %>%
layout(
title = "Calories vs Carbohydrates",
xaxis = list(title = "Calories"),
yaxis = list(title = "Carb. (g)"),
hovermode = "compare"
)
plot_ly Boxplotsfiltered_data <- sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
filter(word %in% topwords$word)
boxplot <- filtered_data %>%
plot_ly(
x = ~ word,
type = "box",
y = ~ Calories,
name = "Calories",
alpha = 0.5
) %>%
add_boxplot(y = ~ `Fat (g)`,
name = "Fat",
alpha = 0.5) %>%
add_boxplot(y = ~ `Carb. (g)`,
name = "Carbohydrates",
alpha = 0.5) %>%
add_boxplot(y = ~ `Fiber (g)`,
name = "Fiber",
alpha = 0.5) %>%
add_boxplot(y = ~ `Protein (g)`,
name = "Protein",
alpha = 0.5) %>%
layout(
title = "Boxplot of Nutritional Contents by Top Words in Item Names",
xaxis = list(title = "Top Words in Item Names"),
yaxis = list(title = "Value"),
hovermode = "compare"
)
boxplot %>% layout(boxmode = "group")
filtered_data %>%
plot_ly(
x = ~ Calories,
y = ~ `Carb. (g)`,
z = ~ `Protein (g)`,
type = "scatter3d",
mode = "markers",
color = ~ word
)
Items with more calories tend to have more carbohydrate and more protein. items whose name includes “sandwich”, (which could be a type of sandwich) tend to have a higher value in all three nutrients.
plot_ly Map# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('steelblue')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create hover text
sb_locs_state$hover <-
with(
sb_locs_state,
paste(
"Number of Starbucks: ",
n_stores,
'<br>',
"State: ",
state.y,
'<br>',
"Population: ",
population
)
)
# Create the map
map1 <- plot_geo(sb_locs_state, locationmode = "USA-states") %>%
add_trace(
z = ~ n_stores,
text = ~ hover,
locations = ~ state,
color = ~ n_stores,
colors = "Purples"
) %>%
layout(title = "Starbucks stores by state", geo = set_map_details)
map2 <- plot_geo(sb_locs_state, locationmode = "USA-states") %>%
add_trace(
z = ~ population,
locations = ~ state,
text = ~ hover,
color = ~ population,
colors = 'Reds'
) %>%
colorbar(title = "Population") %>%
layout(title = "Population by State", geo = set_map_details)
subplot(map1, map2)
It seems that for states with a higher population, the starbucks store count is also higher, such as California, New York, Texas and Florida.